library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(leaflet)
library(sf)
## Warning: package 'sf' was built under R version 4.1.2
## Linking to GEOS 3.9.1, GDAL 3.4.0, PROJ 8.1.1; sf_use_s2() is TRUE
library(raster)
## Warning: package 'raster' was built under R version 4.1.2
## Loading required package: sp
##
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
##
## select
library(sp)
library(diffdf)
library(ggplot2)
let’s load the 2019 data
file_name <- "../data/bluebikes_tripdata_2019.csv"
# read
data_2019 <- read_csv(file_name, progress = FALSE)
## Rows: 2522771 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): start station name, end station name, usertype
## dbl (12): tripduration, start station id, start station latitude, start sta...
## dttm (2): starttime, stoptime
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_2019 <- data_2019 %>% rename("start_long" = "start station longitude",
"start_lat" = "start station latitude",
"end_long" = "end station longitude",
"end_lat" = "end station latitude",
"start_station_id" = "start station id",
"start_station_name" = "start station name",
"end_station_name" = "end station name",
"birth_year" = "birth year")
Clean the 2019 data
# select the columns I need
cleaned_2019 <- data_2019 %>% dplyr::select(c(tripduration, starttime, stoptime, start_lat, start_long, start_station_name, end_lat, end_long, end_station_name, usertype, year, month))
cleaned_2019
## # A tibble: 2,522,771 × 12
## tripduration starttime stoptime start_lat start_long
## <dbl> <dttm> <dttm> <dbl> <dbl>
## 1 790 2019-12-01 00:01:25 2019-12-01 00:14:35 42.4 -71.1
## 2 166 2019-12-01 00:05:42 2019-12-01 00:08:29 42.4 -71.1
## 3 323 2019-12-01 00:08:28 2019-12-01 00:13:52 42.4 -71.1
## 4 709 2019-12-01 00:08:38 2019-12-01 00:20:27 42.4 -71.1
## 5 332 2019-12-01 00:10:08 2019-12-01 00:15:41 42.4 -71.1
## 6 507 2019-12-01 00:14:26 2019-12-01 00:22:53 42.4 -71.1
## 7 794 2019-12-01 00:17:46 2019-12-01 00:31:01 42.4 -71.1
## 8 1354 2019-12-01 00:18:19 2019-12-01 00:40:54 42.4 -71.1
## 9 1227 2019-12-01 00:19:01 2019-12-01 00:39:29 42.3 -71.1
## 10 1068 2019-12-01 00:24:41 2019-12-01 00:42:29 42.3 -71.1
## # … with 2,522,761 more rows, and 7 more variables: start_station_name <chr>,
## # end_lat <dbl>, end_long <dbl>, end_station_name <chr>, usertype <chr>,
## # year <dbl>, month <dbl>
Select the unique entries of start stations
unique_start <- cleaned_2019 %>% dplyr::select(c(start_long, start_lat, start_station_name)) %>% unique()
# there was a single outlier, so I remove it
unique_start <- unique_start[!(unique_start$start_long == 0.0000|unique_start$start_lat == 0.0000), ]
# other outliers
unique_start <- unique_start %>% filter(start_station_name != "BCBS Hingham")
unique_start <- unique_start %>% filter(start_station_name != "Main St at Beacon St")
I do the same for the end stations
unique_end <- cleaned_2019 %>% dplyr::select(c(end_long, end_lat, end_station_name)) %>% unique()
# I do the same for the end stations
unique_end <- unique_end[!(unique_end$end_long == 0.0000|unique_end$end_lat == 0.000), ]
# another outlier
unique_end <- unique_end %>% filter(end_station_name != "BCBS Hingham")
Let’s do the same for the 2020 data
file_name <- "../data/bluebikes_tripdata_2020.csv"
# read
data_2020 <- read_csv(file_name, progress = FALSE)
## Rows: 1999446 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): start station name, end station name, usertype, postal code
## dbl (12): tripduration, start station id, start station latitude, start sta...
## dttm (2): starttime, stoptime
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_2020 <- data_2020 %>% rename("start_long" = "start station longitude",
"start_lat" = "start station latitude",
"end_long" = "end station longitude",
"end_lat" = "end station latitude",
"start_station_id" = "start station id",
"start_station_name" = "start station name",
"end_station_name" = "end station name",
"birth_year" = "birth year")
Let’s clean the 2020 data
cleaned_2020 <- data_2020 %>% dplyr::select(c(tripduration, starttime, stoptime, start_lat, start_long, start_station_name, end_lat, end_long, end_station_name, usertype, year, month))
cleaned_2020
## # A tibble: 1,999,446 × 12
## tripduration starttime stoptime start_lat start_long
## <dbl> <dttm> <dttm> <dbl> <dbl>
## 1 1793 2020-11-01 00:00:18 2020-11-01 00:30:12 42.3 -71.0
## 2 1832 2020-11-01 00:00:34 2020-11-01 00:31:07 42.3 -71.0
## 3 262 2020-11-01 00:01:54 2020-11-01 00:06:17 42.3 -71.0
## 4 419 2020-11-01 00:04:00 2020-11-01 00:10:59 42.4 -71.1
## 5 275 2020-11-01 00:04:01 2020-11-01 00:08:36 42.4 -71.1
## 6 684 2020-11-01 00:04:39 2020-11-01 00:16:03 42.3 -71.1
## 7 608 2020-11-01 00:04:43 2020-11-01 00:14:52 42.4 -71.1
## 8 438 2020-11-01 00:05:57 2020-11-01 00:13:15 42.4 -71.1
## 9 541 2020-11-01 00:07:15 2020-11-01 00:16:16 42.4 -71.1
## 10 1138 2020-11-01 00:07:17 2020-11-01 00:26:16 42.4 -71.1
## # … with 1,999,436 more rows, and 7 more variables: start_station_name <chr>,
## # end_lat <dbl>, end_long <dbl>, end_station_name <chr>, usertype <chr>,
## # year <dbl>, month <dbl>
unique_start_2020 <- cleaned_2020 %>% dplyr::select(c(start_long, start_lat, start_station_name)) %>% unique()
# there was a single outlier, so I remove it
unique_start_2020 <- unique_start_2020[!(unique_start_2020$start_long == 0.0000|unique_start_2020$start_lat == 0.0000), ]
# another outlier
unique_start_2020 <- unique_start_2020 %>% filter(start_station_name != "BCBS Hingham")
unique_end_2020 <- cleaned_2020 %>% dplyr::select(c(end_long, end_lat, end_station_name)) %>% unique()
unique_end_2020 <- unique_end_2020[!(unique_end_2020$end_long == 0.0000|unique_end_2020$end_lat == 0.0000), ]
unique_end_2020 <- unique_end_2020 %>% filter(end_station_name != "BCBS Hingham")
Top routes in 2019
# start stations 2019
unique_start %>% dplyr::select(start_station_name) %>% unique() %>% count() # 359
## # A tibble: 1 × 1
## n
## <int>
## 1 359
# get the station names
stations_2019 <- table(data_2019$start_station_name)
# convert to tibble
stations_2019_df <- as.data.frame(stations_2019) %>% as_tibble()
# get the indices in ascending order of frequency
order(stations_2019_df$Freq)
## [1] 13 237 233 20 355 232 300 347 82 123 236 150 86 88 246 247 160 345
## [19] 187 222 309 318 234 46 225 260 14 8 162 361 34 161 214 93 99 343
## [37] 342 155 315 5 69 306 51 133 154 258 311 322 196 330 33 92 85 39
## [55] 288 156 354 223 165 216 193 2 40 333 313 159 114 332 47 131 335 27
## [73] 274 102 146 41 145 157 310 104 316 334 182 26 340 360 273 242 254 98
## [91] 136 66 151 134 294 125 54 277 281 31 143 204 312 50 148 138 12 265
## [109] 287 9 329 73 344 68 140 135 249 121 262 43 231 314 348 15 10 336
## [127] 132 158 115 137 181 58 110 362 57 279 32 224 107 270 153 101 45 90
## [145] 139 350 127 28 87 217 124 351 67 352 198 18 266 72 243 280 245 152
## [163] 226 144 42 189 308 111 177 295 255 290 149 105 186 17 70 346 304 7
## [181] 141 49 23 219 363 213 235 211 197 16 220 286 64 221 202 194 303 184
## [199] 74 215 11 37 264 65 239 240 167 75 285 164 19 349 116 293 147 190
## [217] 326 267 188 3 195 206 94 71 299 261 112 250 106 324 283 251 191 212
## [235] 358 52 359 185 317 272 122 305 166 176 296 253 56 63 208 289 117 100
## [253] 113 292 323 302 268 169 179 327 109 103 142 38 337 4 168 339 259 35
## [271] 118 301 44 269 297 271 338 25 171 199 76 357 91 276 126 248 59 178
## [289] 1 356 320 95 257 328 77 319 307 175 128 275 78 21 203 79 325 81
## [307] 291 180 192 321 55 183 163 174 170 129 6 172 263 119 96 209 341 278
## [325] 130 201 29 24 282 256 207 60 53 210 244 284 62 238 205 331 353 48
## [343] 218 108 120 80 89 36 83 30 252 61 97 230 228 173 241 200 22 298
## [361] 229 84 227
# select the top 36
top36_2019 <- stations_2019_df[c(24, 282, 256, 207, 60, 53, 210, 244, 284, 62, 238, 205, 331, 353, 48, 218, 108, 120, 80, 89, 36, 83, 30, 252, 61, 97, 230, 228, 173, 241, 200, 22, 298, 229, 84, 227),]
# rename station name column
top36_2019 <- rename(top36_2019,
"start_station_name" = "Var1")
# merge data frame with longitude and latitude coordinates from data_2019
top36_2019 <- left_join(top36_2019, unique_start, by = "start_station_name")
# somehow I got 41 stations from this, but let's plot them anyway
leaflet() %>% addTiles() %>%
addCircleMarkers(lng = top36_2019$start_long, lat = top36_2019$start_lat)
# end stations 2019
unique_end %>% dplyr::select(end_station_name) %>% unique() %>% count() # 360
## # A tibble: 1 × 1
## n
## <int>
## 1 360
# get the station names
stations_2019_end <- table(data_2019$end_station_name)
# convert to tibble
stations_2019_df_end <- as.data.frame(stations_2019_end) %>% as_tibble()
# get the indices in ascending order of frequency
order(stations_2019_df_end$Freq)
## [1] 13 237 341 233 20 232 356 348 300 82 86 88 123 150 160 236 346 247
## [19] 187 246 234 318 309 225 222 46 155 34 14 69 260 315 344 362 214 8
## [37] 93 162 99 156 161 343 133 154 5 306 51 258 322 355 288 193 33 330
## [55] 165 196 39 85 311 92 313 40 332 333 47 216 223 102 114 131 146 121
## [73] 157 335 27 159 145 104 294 41 242 182 316 274 334 310 340 98 361 273
## [91] 26 136 125 66 254 151 134 54 2 143 277 349 31 314 281 138 204 50
## [109] 148 312 12 140 287 231 9 265 68 135 345 329 73 43 262 110 249 10
## [127] 15 336 132 158 58 67 137 181 115 32 57 279 347 363 107 224 270 295
## [145] 101 45 90 139 280 111 153 144 352 217 87 351 255 266 353 127 70 42
## [163] 198 226 124 152 243 18 177 189 186 245 290 308 49 211 17 37 72 286
## [181] 197 213 304 364 141 219 23 64 105 7 19 235 16 220 149 28 293 202
## [199] 75 221 285 184 303 194 11 215 240 188 65 167 74 164 112 350 261 239
## [217] 147 116 267 326 3 94 195 113 250 206 71 212 360 264 283 324 317 251
## [235] 106 190 52 191 185 359 122 299 176 253 305 339 296 63 272 323 208 302
## [253] 169 292 179 289 109 56 100 327 268 38 142 118 103 337 297 117 4 168
## [271] 44 35 269 259 301 271 166 91 171 126 276 199 76 338 178 358 357 77
## [289] 248 163 320 1 319 59 257 25 95 175 6 192 307 328 79 21 78 275
## [307] 203 183 325 291 128 321 81 180 174 263 342 172 55 96 119 256 129 170
## [325] 29 201 278 282 207 24 130 284 331 60 53 244 209 62 210 238 205 218
## [343] 48 354 108 80 89 36 252 120 30 83 61 97 230 200 228 173 298 229
## [361] 241 22 84 227
# select the top 36
top36_2019_end <- stations_2019_df_end[c(207, 24, 130, 284, 331, 60, 53, 244, 209, 62, 210, 238, 205, 218, 48, 354, 108, 80, 89, 36, 252, 120, 30, 83, 61, 97, 230, 200, 228, 173, 298, 229, 241, 22, 84, 227),]
# rename station name column
top36_2019_end <- rename(top36_2019_end,
"end_station_name" = "Var1")
# merge data frame with longitude and latitude coordinates from data_2019
top36_2019_end <- left_join(top36_2019_end, unique_end, by = "end_station_name")
# transform to sf
coords_start_2019 <- top36_2019 %>% st_as_sf(coords = c("start_long", "start_lat"), crs = 4326)
coords_end_2019 <- top36_2019_end %>% st_as_sf(coords = c("end_long", "end_lat"), crs = 4326)
# get the coordinates
coords_2019 <- cbind(st_coordinates(coords_start_2019$geometry),st_coordinates(coords_end_2019$geometry))
# get the routes
linestrings_2019 = st_sfc(
lapply(1:nrow(coords_2019),
function(i){
st_linestring(matrix(coords_2019[i,],ncol=2,byrow=TRUE))
}))
# give them a crs
linestrings_2019 <- linestrings_2019 %>% st_set_crs(4326)
plot(linestrings_2019)
# get average length of linestring
st_length(linestrings_2019) %>% mean() # 1494.143 m = 1.5 km
## 1494.143 [m]
# plot the routes on a map
leaflet(linestrings_2019) %>% addTiles() %>%
setView(lng = -71.08083, lat = 42.361145, zoom = 13) %>%
addPolylines() %>%
addCircleMarkers(lng = top36_2019_end$end_long, lat = top36_2019_end$end_lat, color = "red", popup = paste0("<strong>End station: </strong>", top36_2019_end$end_station_name)) %>%
addCircleMarkers(lng = top36_2019$start_long, lat = top36_2019$start_lat, popup = paste0("<strong>Start station: </strong>", top36_2019$start_station_name)) %>% addScaleBar()
Let’s do the same for the top routes in 2020
# start stations 2020
unique_start_2020 %>% dplyr::select(start_station_name) %>% unique() %>% count() # 384
## # A tibble: 1 × 1
## n
## <int>
## 1 384
stations_2020_start <- table(data_2020$start_station_name)
stations_2020_df_start <- as.data.frame(stations_2020_start) %>% as_tibble()
order(stations_2020_df_start$Freq)
## [1] 253 33 288 49 287 208 382 215 95 134 34 184 232 364 30 227 381 349
## [19] 263 6 231 281 168 154 159 176 118 73 174 250 340 229 277 234 329 360
## [37] 2 48 25 238 92 367 375 257 75 18 265 158 362 138 320 365 94 311
## [55] 252 109 374 338 62 186 167 293 50 42 107 353 221 124 358 150 331 53
## [73] 239 13 337 286 88 166 292 99 259 245 36 43 24 334 307 378 149 74
## [91] 352 155 57 197 172 384 264 140 98 26 136 163 164 314 160 71 8 332
## [109] 170 142 205 70 244 296 147 335 199 129 191 192 300 330 350 380 361 10
## [127] 249 161 333 60 31 260 126 354 203 241 143 104 12 152 52 44 37 326
## [145] 267 315 141 72 279 363 144 91 80 371 139 273 128 304 76 289 54 298
## [163] 308 196 157 114 200 385 93 224 137 207 242 272 216 145 368 78 370 383
## [181] 226 228 77 7 282 256 61 235 306 131 240 283 162 32 262 366 299 165
## [199] 291 251 17 386 16 27 206 313 21 81 302 15 39 171 372 346 106 40
## [217] 379 201 69 321 225 271 59 14 87 68 322 3 194 148 369 46 187 110
## [235] 41 4 218 151 11 117 278 198 121 230 323 336 285 173 309 185 120 325
## [253] 209 122 220 105 175 319 23 119 204 116 169 294 328 156 276 189 324 101
## [271] 327 344 180 355 79 111 153 108 348 343 58 310 317 82 55 376 177 269
## [289] 86 9 47 290 84 133 146 341 268 183 100 179 305 312 237 83 357 195
## [307] 255 280 236 67 345 113 188 347 97 63 316 178 56 213 202 181 303 212
## [325] 339 66 377 359 28 284 275 115 29 130 19 342 211 1 217 193 45 51
## [343] 135 5 35 125 295 356 22 132 351 190 270 318 64 233 266 102 123 297
## [361] 214 301 373 112 261 210 219 85 89 223 222 254 258 248 274 247 182 38
## [379] 246 127 65 20 103 243 96 90
top38_2020_start <- stations_2020_df_start[c(22, 132, 351, 190, 270, 318, 64, 233, 266, 102, 123, 297, 214, 301, 373, 112, 261, 210, 219, 85, 89, 223, 222, 254, 258, 248, 274, 247, 182, 38, 246, 127, 65, 20, 103, 243, 96, 90),]
top38_2020_start <- top38_2020_start %>% rename("start_station_name" = "Var1")
top38_2020_start <- left_join(top38_2020_start, unique_start_2020, by = "start_station_name")
# end stations 2020
unique_end_2020 %>% dplyr::select(end_station_name) %>% unique() %>% count() # 385
## # A tibble: 1 × 1
## n
## <int>
## 1 385
stations_2020_end <- table(data_2020$end_station_name)
stations_2020_df_end <- as.data.frame(stations_2020_end) %>% as_tibble()
order(stations_2020_df_end$Freq)
## [1] 250 254 33 289 288 383 49 215 208 134 95 34 232 184 365 30 382 350
## [19] 159 264 231 227 282 174 168 176 6 154 118 73 341 251 330 229 234 361
## [37] 278 368 92 48 75 338 366 238 25 376 363 158 138 94 321 258 18 312
## [55] 266 253 109 375 339 62 186 167 50 359 42 107 294 354 221 124 260 150
## [73] 332 53 245 13 36 335 2 239 287 88 43 166 379 57 308 99 24 149
## [91] 172 353 197 160 385 74 293 265 140 155 98 315 163 136 26 170 333 8
## [109] 71 147 142 164 126 244 199 205 362 70 297 72 161 336 129 249 191 192
## [127] 381 10 351 334 301 31 331 316 261 241 355 114 52 80 203 143 104 60
## [145] 76 12 44 152 268 280 37 305 327 274 144 141 364 367 91 139 309 128
## [163] 196 290 372 32 224 54 299 369 386 157 200 93 137 226 145 216 314 242
## [181] 273 78 207 257 228 371 283 300 61 384 7 77 284 235 131 307 240 162
## [199] 252 206 165 17 40 292 387 16 303 21 347 263 272 81 106 68 15 27
## [217] 171 373 380 39 225 3 201 322 87 14 46 59 117 148 323 11 69 194
## [235] 187 279 370 41 151 4 198 337 218 286 230 310 121 116 169 324 185 110
## [253] 120 326 122 209 173 220 175 23 119 105 295 320 189 356 204 180 329 156
## [271] 325 277 328 345 101 269 111 79 344 318 108 153 82 311 113 270 358 47
## [289] 100 183 291 146 177 84 377 306 86 83 237 313 58 349 342 55 9 133
## [307] 281 348 195 236 360 256 188 67 346 97 213 178 340 212 56 115 179 202
## [325] 66 317 63 181 285 130 378 276 28 29 19 343 5 304 217 211 51 45
## [343] 1 135 35 22 296 352 357 125 132 193 271 190 233 319 64 267 123 102
## [361] 214 302 298 112 210 374 262 219 85 247 89 223 248 275 255 246 222 38
## [379] 259 127 182 65 20 243 103 96 90
top38_2020_end <- stations_2020_df_end[c(125, 132, 193, 271, 190, 233, 319, 64, 267, 123, 102, 214, 302, 298, 112, 210, 374, 262, 219, 85, 247, 89, 223, 248, 275, 255, 246, 222, 38, 259, 127, 182, 65, 20, 243, 103, 96, 90),]
top38_2020_end <- top38_2020_end %>% rename("end_station_name" = "Var1")
top38_2020_end <- left_join(top38_2020_end, unique_end_2020, by = "end_station_name")
coords_start_2020 <- top38_2020_start %>% st_as_sf(coords = c("start_long", "start_lat"), crs = 4326)
coords_end_2020 <- top38_2020_end %>% st_as_sf(coords = c("end_long", "end_lat"), crs = 4326)
coords_2020 <- cbind(st_coordinates(coords_start_2020$geometry),st_coordinates(coords_end_2020$geometry))
linestrings_2020 = st_sfc(
lapply(1:nrow(coords_2020),
function(i){
st_linestring(matrix(coords_2020[i,],ncol=2,byrow=TRUE))
}))
linestrings_2020 <- linestrings_2020 %>% st_set_crs(4326)
plot(linestrings_2020)
st_length(linestrings_2020) %>% mean() # 2137.295 m = 2,1 km
## 2137.295 [m]
leaflet(linestrings_2020) %>% addTiles() %>%
setView(lng = -71.08083, lat = 42.361145, zoom = 13) %>%
addPolylines() %>%
addCircleMarkers(lng = top38_2020_end$end_long, lat = top38_2020_end$end_lat, color = "red", popup = paste0("<strong>End station: </strong>", top36_2019_end$end_station_name)) %>%
addCircleMarkers(lng = top38_2020_start$start_long, lat = top38_2020_start$start_lat, popup = paste0("<strong>Start station: </strong>", top36_2019$start_station_name)) %>% addScaleBar()
Seeing as I have plotted the routes in euclidian space, they do not resemble the real routes the users took very much. However, we are able to say something about where the users move between. It does not seem to be the case that there is a trend from north to south or from vest to east. Since most dots are purple, it seems that the users move within this space in all directions. However, very tentitively, the two red dots are below the river which could indicate a from north to south migration, which one could explore with more points and routes.
Let’s explore the elevation of the city which also can have an impact of the movement opportunities the city offers.
# read in elevation raster
elevation <- raster("../data/_ags_ff2f2033_740d_47f5_8f3e_61b68c7d95f1.tif")
# read in the neighborhoods
neighborhoods <- st_read("../data/Boston_Neighborhoods/Boston_Neighborhoods.shp")
## Reading layer `Boston_Neighborhoods' from data source
## `/Users/mettehejberg/Documents/6. semester engelsk /Spatial Analytics/exam/data/Boston_Neighborhoods/Boston_Neighborhoods.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 26 features and 7 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 739715.3 ymin: 2908294 xmax: 812255.1 ymax: 2970086
## Projected CRS: NAD83 / Massachusetts Mainland (ftUS)
# reproject to get it on the map
projected_neighborhoods <- st_transform(neighborhoods, 4326)
# define colors
pal <- colorNumeric(c("#A52A2A", "#90A926", "#55EA13"), values(elevation),
na.color = "transparent")
# plot raster on map
leaflet(projected_neighborhoods) %>% addTiles() %>% setView(lng = -71.067083, lat = 42.330145, zoom = 11.5) %>% addRasterImage(elevation, colors = pal, opacity = 0.8) %>% addLegend(pal = pal, values = values(elevation),
title = "Elevation in meters above sea level") %>%
addPolygons(stroke = TRUE, fillColor = "transparent", color = "black", weight = 1) %>% addCircleMarkers(lng = top36_2019$start_long, lat = top36_2019$start_lat, color = "black") %>% addScaleBar()
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded ellps WGS 84 in Proj4 definition: +proj=merc +a=6378137
## +b=6378137 +lat_ts=0 +lon_0=0 +x_0=0 +y_0=0 +k=1 +units=m +nadgrids=@null
## +wktext +no_defs +type=crs
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum World Geodetic System 1984 in Proj4 definition
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded ellps WGS 84 in Proj4 definition: +proj=merc +a=6378137
## +b=6378137 +lat_ts=0 +lon_0=0 +x_0=0 +y_0=0 +k=1 +units=m +nadgrids=@null
## +wktext +no_defs +type=crs
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum World Geodetic System 1984 in Proj4 definition
There is a lot of elevation in the southern part of the city which probably also partly can explain the fact that there are less points there. The most used stations are centered in downtown Boston where there is little elevation.
Extract elevation for stations
# remove outliers if there are any
points <- unique_start %>%
filter(! is.na(start_long)) %>%
filter(! is.na(start_lat))
# transform points to sf object
points_sf <- st_as_sf(points, coords = c("start_long", "start_lat"), crs = 4326)
# extract the elevations of the stations
points_sf$elevation <- raster::extract(elevation, points_sf)
## Warning in .local(x, y, ...): Transforming SpatialPoints to the crs of the
## Raster
# Look at the points and extraction results - add polygon on top for scale
plot(points_sf["elevation"])
# transform crs of points and neighborhoods to that of elevation raster
points_elevation <- st_transform(points_sf, crs = crs(elevation, asText = TRUE))
neighborhoods_elevation <- st_transform(neighborhoods, crs = crs(elevation, asText = TRUE))
# plot the three object together
plot(elevation)
plot(points_elevation$geometry, add = TRUE)
plot(neighborhoods_elevation$geometry, border = "black", add = TRUE)
# histogram of frequency of elevation
ggplot(points_sf, aes(x = elevation)) + geom_histogram() + labs(x="Elevation", y = "Frequency") + theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5 rows containing non-finite values (stat_bin).
ggsave("../data_outputs/elevation_histogram.jpg")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5 rows containing non-finite values (stat_bin).
The first plot of the elevation of the stations show that there are many more stations located in areas with low elevation, and the second plot with the neighborhood polygon overlay confirms this.
We can see from the histogram that there are more points with lower elevation than higher, and from the plots that there is a trend towards the north of lower elevation and higher elevation towards the south. The histogram also tells us that there are no points in the areas with the highest elevation. The highest elevation for the points is 50 m. Therefore, Boston does have variation in elevation, however the users of the bikes do not move within the spaces of most elevation.
Get more routes - all unique stations of 2019
coords_start_2019_all <- unique_start %>% st_as_sf(coords = c("start_long", "start_lat"), crs = 4326)
coords_end_2019_all <- unique_end %>% st_as_sf(coords = c("end_long", "end_lat"), crs = 4326)
# find differences between the two objects
end <- coords_end_2019_all$end_station_name %>% as_tibble()
start <- coords_start_2019_all$start_station_name %>% as_tibble()
diffdf(end, start)
## Warning in diffdf(end, start):
## There are rows in BASE that are not in COMPARE !!
## Not all Values Compared Equal
## Differences found between the objects!
##
## A summary is given below.
##
## There are rows in BASE that are not in COMPARE !!
## All rows are shown in table below
##
## ===============
## ..ROWNUMBER..
## ---------------
## 407
## 408
## ---------------
##
## Not all Values Compared Equal
## All rows are shown in table below
##
## =============================
## Variable No of Differences
## -----------------------------
## value 396
## -----------------------------
##
##
## First 10 of 396 rows are shown in table below
##
## ===============================================================================================
## VARIABLE ..ROWNUMBER.. BASE COMPARE
## -----------------------------------------------------------------------------------------------
## value 1 Kenmore Square Dartmouth St at Newbury St
## value 2 MIT at Mass Ave / Amherst St MIT Stata Center at Vassar St ...
## value 3 Verizon Innovation Hub 10 Ware... Inman Square at Springfield St...
## value 4 Sidney Research Campus/ Erie S... Third at Binney
## value 5 Harvard Law School at Mass Ave... Verizon Innovation Hub 10 Ware...
## value 6 Inman Square at Springfield St... University Park
## value 7 Lower Cambridgeport at Magazin... One Kendall Square at Hampshir...
## value 8 Elm St at White St 359 Broadway - Broadway at Fay...
## value 9 Packard's Corner - Commonwealt... Prudential Center - 101 Huntin...
## value 10 Sydney St at Carson St Encore
## -----------------------------------------------------------------------------------------------
# remove rows with differences
coords_end_2019_all <- coords_end_2019_all[c(0-406),]
coords_end_2019_all <- coords_end_2019_all[c(0-406),]
coords_2019_all <- cbind(st_coordinates(coords_start_2019_all$geometry), st_coordinates(coords_end_2019_all$geometry))
# get all routes in 2019
linestrings_2019_all <- st_sfc(
lapply(1:nrow(coords_2019_all),
function(i){
st_linestring(matrix(coords_2019_all[i,], ncol=2, byrow=TRUE))
})
)
plot(linestrings_2019_all)
linestrings_2019_all <- linestrings_2019_all %>% st_set_crs(4326)
lengths <- st_length(linestrings_2019_all)
lengths %>% mean() # 4811.787 == 4,8 km
## 4811.787 [m]
lengths %>% median() # 4411.232 == 4.4 km
## 4411.232 [m]
#lengths <- lengths %>% as_tibble()
lengths <- lengths %>% as.numeric()
lengths <- lengths %>% as_tibble()
ggplot(lengths, aes(x = value)) + geom_histogram() + labs(x="Length in meters", y = "Frequency") + theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggsave("../data_outputs/routes_histogram.jpg")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The histogram visually confirms that the median length closely matches the mean length.